#!/usr/bin/env python
# Create: 2019/1/10
__author__ = '749B'

from twisted.web.client import getPage, defer
from twisted.internet import reactor
import queue


class Request(object):
    """封装请求的url和回调函数"""

    def __init__(self, url, callback):
        self.url = url
        self.callback = callback


class Scheduler(object):
    """调度器"""

    def __init__(self, engine):
        self.engine = engine
        self.q = queue.Queue()

    def enqueue_request(self, request):
        """添加任务"""
        self.q.put(request)

    def next_request(self):
        """获取下一个任务"""
        try:
            req = self.q.get(block=False)
        except queue.Empty:
            req = None
        return req

    def size(self):
        return self.q.qsize()


class ExecutionEngine(object):
    """引擎"""

    def __init__(self):
        self._close_wait = None  # stop_deferred
        self.start_requests = None
        self.scheduler = Scheduler(self)
        self.in_progress = set()  # 正在执行中的任务

    def _next_request(self):
        while self.start_requests:
            request = next(self.start_requests, None)
            if request:
                self.scheduler.enqueue_request(request)
            else:
                self.start_requests = None
        while len(self.in_progress) < 5 and self.scheduler.size() > 0:  # 最大编发为5
            request = self.scheduler.next_request()
            if not request:
                break
            self.in_progress.add(request)
            d = getPage(bytes(request.url, encoding='utf-8'))
            # addCallback是正确返回的时候执行，还有addErrback是返回有错误的时候执行
            # addBoth就是上面两种情况返回都会执行
            d.addBoth(self._handle_downloader_output, request)
            d.addBoth(lambda x, req: self.in_progress.remove(req), request)
            d.addBoth(lambda x: self._next_request())
        if len(self.in_progress) == 0 and self.scheduler.size() == 0:
            self._close_wait.callback(None)

    def _handle_downloader_output(self, response, request):
        import types
        gen = request.callback(response)
        if isinstance(gen, types.GeneratorType):  # 是否为生成器类型
            for req in gen:
                # 这里还可以再加判断，如果是request对象则继续爬取
                # 如果是item对象，则可以交给pipline
                self.scheduler.enqueue_request(req)

    @defer.inlineCallbacks
    def open_spider(self, start_requests):
        self.start_requests = start_requests
        yield None
        reactor.callLater(0, self._next_request)  # 过多少秒之后，执行后面的函数

    @defer.inlineCallbacks
    def start(self):
        """原来的stop函数"""
        self._close_wait = defer.Deferred()
        yield self._close_wait


@defer.inlineCallbacks
def crawl(start_requests):
    """原来的task函数"""
    engine = ExecutionEngine()
    start_requests = iter(start_requests)
    yield engine.open_spider(start_requests)
    yield engine.start()


def all_done(arg):
    print("All Done")
    reactor.stop()


def one_done(response):
    print(response)


count = 0
def chouti(response):
    """任务返回后生成新的Request继续交给调度器执行"""
    global count
    count += 1
    print(response)
    if count > 3:
        return None
    for i in range(10):
        yield Request("http://dig.chouti.com/all/hot/recent/%s" % i, lambda x: print(len(x)))


def chouti2():
    n = 0

    def func(response):
        print(response)
        nonlocal n
        n += 1
        if n > 3:
            return None
        for i in range(10):
            yield Request("http://dig.chouti.com/all/hot/recent/%s" % i, lambda x: print(len(x)))
    return func


if __name__ == '__main__':
    url_list = [
        'http://www.bing.com',
        'https://www.baidu.com',
        'http://edu.51cto.com',
    ]
    # requests = [Request(url, callback=one_done) for url in url_list]
    # requests = [Request(url, callback=chouti) for url in url_list]
    callback = chouti2()
    requests = [Request(url, callback=callback) for url in url_list]
    ret = crawl(requests)
    ret.addBoth(all_done)
    reactor.run()

